'''
Created on 2017-06-24

@author: GaoHui
'''
import datetime
import hashlib
import re
from urllib import parse
from scrapy.http import Request
import scrapy
from scrapy import cmdline
from items import JobboleArticleItem, ArticleItemLoader

if __name__ == '__main__':
    # Running this file as a script hands the "scrapy crawl jobbole" command
    # line to Scrapy's own command dispatcher.
    cmdline.execute(["scrapy", "crawl", "jobbole"])


def get_md5(url):
    """Return the hexadecimal MD5 digest of *url*.

    A ``str`` argument is encoded as UTF-8 first, because hashlib operates on
    bytes; any other value is passed to hashlib unchanged.
    """
    payload = url.encode("utf-8") if isinstance(url, str) else url
    return hashlib.md5(payload).hexdigest()


class JobboleSpider(scrapy.Spider):
    """Spider for the python.jobbole.com post archive.

    ``parse`` walks the paginated listing pages and follows every article
    link with ``parse_loader_detail`` as the callback. ``parse_detail`` is an
    alternative callback that fills the item field by field instead of going
    through ``ArticleItemLoader``.
    """
    name = 'jobbole'
    allowed_domains = ['python.jobbole.com']
    start_urls = ['http://python.jobbole.com/all-posts/']

    @staticmethod
    def _first_int(text, default=0):
        """Return the first run of digits in *text* as an int, else *default*."""
        # re.search (rather than re.match with a leading ".*?") also finds the
        # number when a newline precedes it.
        found = re.search(r"\d+", text)
        return int(found.group()) if found else default

    def parse(self, response):
        """Yield one request per article on a listing page, then the next page.

        The thumbnail URL of each article is passed to the detail callback
        through ``meta["front_image_url"]``; it is an empty string when the
        listing entry has no image.
        """
        for node in response.css('#archive .floated-thumb .post-thumb a'):
            post_url = node.css("::attr(href)").extract_first("")
            if not post_url:
                # urljoin(base, "") returns the base, which would re-request
                # the listing page with the detail callback.
                continue

            image_url = node.css("img::attr(src)").extract_first("")
            if image_url:
                # Resolves protocol-relative ("//host/x.jpg") and site-relative
                # paths against the page URL; absolute URLs pass through.
                image_url = parse.urljoin(response.url, image_url)

            self.logger.debug("article %s (thumbnail: %s)", post_url, image_url)
            yield Request(url=parse.urljoin(response.url, post_url),
                          callback=self.parse_loader_detail,
                          meta={"front_image_url": image_url})

        next_url = response.css("#archive .navigation .next::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        """Build a ``JobboleArticleItem`` by hand from an article page.

        ``fav_nums`` and ``comment_nums`` are converted to ints (0 when no
        number is present); ``create_date`` and ``praise_nums`` are stored as
        the extracted text.
        """
        item = JobboleArticleItem()

        raw_date = response.css(".entry-meta p.entry-meta-hide-on-mobile::text").extract_first('')
        # The date text is followed by a "·" separator on the page.
        create_date = raw_date.strip().replace("·", "").strip()

        fav_nums = self._first_int(
            response.css(".post-adds span.bookmark-btn::text").extract_first(''))
        comment_nums = self._first_int(
            response.css("a[href='#article-comment'] span::text").extract_first(''))

        # All link texts are needed here: extract_first() would return a single
        # string and the filter below would then iterate its characters.
        tag_texts = response.css("p.entry-meta-hide-on-mobile a::text").extract()
        # The comment-count link sits in the same paragraph; it is not a tag.
        tags = ",".join(tag for tag in tag_texts if not tag.strip().endswith("评论"))

        front_image_url = response.meta.get("front_image_url")
        item["front_image_url"] = [front_image_url] if front_image_url else []
        item["url_object_id"] = get_md5(response.url)
        item["title"] = response.css(".entry-header h1::text").extract_first('')
        item["create_date"] = create_date
        item["url"] = response.url
        item["praise_nums"] = response.css(".post-adds .vote-post-up h10::text").extract_first('')
        item["comment_nums"] = comment_nums
        item["fav_nums"] = fav_nums
        item["tags"] = tags
        item["content"] = response.css("div.entry").extract_first('')
        yield item

    def parse_loader_detail(self, response):
        """Build the article item through ``ArticleItemLoader``.

        Only the selectors are declared here; whatever cleaning is applied to
        the extracted values is up to the loader and item definitions imported
        from ``items``.
        """
        front_image_url = response.meta.get("front_image_url", "")
        item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        if front_image_url:
            # Skip the field when the listing had no thumbnail rather than
            # storing an empty URL.
            item_loader.add_value('front_image_url', [front_image_url])
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", ".entry-meta p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("url", response.url)
        item_loader.add_css("praise_nums", ".post-adds .vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".post-adds span.bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_xpath('content', '//div[@class="entry"]')
        yield item_loader.load_item()

